
from numpy.random import binomial

class MAB_Problem:
    def __init__(self, T=100000, K=2, delta=0.1):
        self.T, self.K, self.delta = T, K, delta
        # Arm 0 has mean 0.5 + delta; every other arm has mean 0.5
        self.mean = [0.5 + (self.delta if i == 0 else 0) for i in range(self.K)]
        # Cache the full reward sequence of each arm
        self.rewards = [binomial(1, i_mean, int(T)) for i_mean in self.mean]

    def start_game(self):
        """Iterate through the rounds of the game."""
        for t in range(int(self.T)):
            self.t = t
            yield int(t)
        del self.t

    def play(self, k):
        """Return the reward of arm k at the current round."""
        return self.rewards[k][self.t]
The minimal number of samples needed for an algorithm to be $\epsilon$-$\delta$-PAC (Probably Approximately Correct) on a MAB problem is:
$$ \Omega \left( \frac{K \text{log}(\frac{1}{\delta})}{\epsilon^2} \right)$$
$\delta$ has a low impact (it only appears inside the log), but the number of samples needed grows quadratically in $\frac{1}{\epsilon}$. For instance, with $K = 2$, $\delta = 0.05$ and $\epsilon = 0.1$, the bound is on the order of $2 \log(20) / 0.01 \approx 600$ samples.
We know that the mean cumulative regret of any algorithm is at least
$$ \Omega \left( \frac{K \text{log}(\frac{KT}{\Delta})}{\Delta} \right).$$
The problem becomes harder as $\Delta$ gets smaller.
Hoeffding's inequality: after $t$ pulls of an arm, the difference between its empirical mean reward and its true mean reward is at most $\epsilon$ with probability $1-\delta$, where
$$\epsilon = \sqrt{\frac{\text{log}\left(\frac{c t^2}{\delta}\right)}{t}}.$$
Fun fact: The empirical mean is an $\epsilon$-$\delta$-PAC estimator.
import math
import numpy as np

def get_confidence_bound(t, delta=0.1, c=1):
    # Hoeffding-style bound: epsilon = sqrt(log(c * t^2 / delta) / t)
    return np.sqrt(np.log(c * pow(t, 2) / delta) / t)
(Figure: the confidence bound $\epsilon$ as a function of $t$.)
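As a quick numerical check (a sketch, not part of the original notebook), the bound indeed shrinks as $t$ grows:

# Hypothetical sanity check of the bound above
for t in [10, 100, 1000, 10000]:
    print(f"t = {t:>5}: epsilon ~ {get_confidence_bound(t):.3f}")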
class RandomBandit():
    def __init__(self, K):
        self.K = K
        self.t = 0
        self.counts = [[1] for k in range(K)]   # Number of times each arm has been played
        self.rewards = [[0] for k in range(K)]  # History of cumulative rewards per arm
        self.cumulative_reward = [0]            # The global cumulative reward

    def select(self):
        # Pick an arm uniformly at random
        return np.random.randint(self.K)

    def observe(self, k_t, y_t):
        self.t += 1
        self.cumulative_reward.append(self.cumulative_reward[-1] + y_t)
        for k in range(self.K):
            self.counts[k].append(self.counts[k][-1] + (1 if k == k_t else 0))
            self.rewards[k].append(self.rewards[k][-1] + (y_t if k == k_t else 0))
mab = MAB_Problem(delta=0.05)
player = RandomBandit(2)
rewards = []
for i in mab.start_game():
    k = player.select()
    y = mab.play(k)
    player.observe(k, y)
    rewards.append(y)
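One way to evaluate this run (a sketch of one possible metric; the original figures may have plotted something slightly different) is the empirical pseudo-regret, peeking at the true means stored in `mab.mean`:

# Empirical pseudo-regret: expected gain of always playing the best arm,
# minus the reward the player actually collected
best_mean = max(mab.mean)
pseudo_regret = best_mean * len(rewards) - sum(rewards)
print(f"Pseudo-regret of the random player: {pseudo_regret:.1f}")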
(Figures: results of the A/B and A/B/C experiments.)
An arm $k$ is eliminated when
$$\hat{\mu}^\text{max} - \hat{\mu}_k > 2\epsilon$$
with
$$\epsilon = \sqrt{\frac{\text{log}\left(\frac{c t^2}{\delta}\right)}{t}}.$$
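Below is a minimal sketch of this elimination strategy (class and attribute names are mine, not the original notebook's), reusing `get_confidence_bound` from above and the same `select`/`observe` interface:

class EliminationBandit():
    def __init__(self, K, delta=0.1):
        self.K = K
        self.delta = delta
        self.active = list(range(K))            # Arms not yet eliminated
        self.t = np.array([1] * K)              # Number of pulls per arm
        self.mean_rewards = np.array([0.] * K)  # Empirical means
        self._cursor = 0

    def select(self):
        # Play the still-active arms in a round-robin fashion
        self._cursor = (self._cursor + 1) % len(self.active)
        return self.active[self._cursor]

    def observe(self, k_t, y_t):
        self.mean_rewards[k_t] = (y_t + self.t[k_t] * self.mean_rewards[k_t]) / (self.t[k_t] + 1)
        self.t[k_t] += 1
        # Drop every arm whose empirical mean is more than 2 * epsilon below
        # the best one (the empirically best arm always survives)
        mu_max = max(self.mean_rewards[k] for k in self.active)
        self.active = [k for k in self.active
                       if mu_max - self.mean_rewards[k] <= 2 * get_confidence_bound(self.t[k], self.delta)]

Once eliminated, an arm is never played again, so the remaining budget concentrates on the arms still in contention.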
Concept: in the face of uncertainty, be optimistic.
Algorithm: play the arm $k$ that maximizes
$$\text{ucb}(k) = \hat{\mu}_k + \sqrt{\frac{2\text{log}(t)}{t_k}}.$$

class UCB():
    def __init__(self, K):
        self.K = K
        self.T = 1                              # Total number of rounds played
        self.t = np.array([1] * K)              # Number of times each arm has been played
        self.mean_rewards = np.array([0.] * K)  # Empirical means
        # Visualization stuff
        self.empirical_rewards = [[] for i in range(self.K)]
        self.confidence_bound = [[] for i in range(self.K)]

    def _get_ucb(self, k):
        # Optimistic index: empirical mean plus exploration bonus
        return self.mean_rewards[k] + math.sqrt(2 * math.log(self.T) / self.t[k])

    def select(self):
        ucb = [self._get_ucb(i_k) for i_k in range(self.K)]
        max_ = np.argmax(np.array(ucb))
        # Visualization stuff
        for i_k in range(self.K):
            self.empirical_rewards[i_k].append(self.mean_rewards[i_k])
            self.confidence_bound[i_k].append(ucb[i_k])
        return max_

    def observe(self, k_t, y_t):
        # Incremental update of the empirical mean of arm k_t
        self.mean_rewards[k_t] = float(y_t + self.t[k_t] * self.mean_rewards[k_t]) / float(self.t[k_t] + 1)
        self.t[k_t] += 1
        self.T += 1
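The UCB player can then be run exactly like the random one (a sketch mirroring the earlier loop):

mab = MAB_Problem(delta=0.05)
player = UCB(2)
rewards = []
for i in mab.start_game():
    k = player.select()
    y = mab.play(k)
    player.observe(k, y)
    rewards.append(y)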
(Figure: empirical means and UCB confidence bounds of each arm over time.)
class TS():
    def __init__(self, K):
        self.K = K
        self.T = 1
        # Beta(1, 1) prior on each arm's mean reward
        self.failure = np.array([1] * K)
        self.success = np.array([1] * K)
        # Visualization stuff
        self.failures = [[1] for i in range(self.K)]
        self.successes = [[1] for i in range(self.K)]

    def _draw(self, k):
        # Sample a plausible mean reward from the posterior of arm k
        return np.random.beta(self.success[k], self.failure[k])

    def select(self):
        draw = [self._draw(i_k) for i_k in range(self.K)]
        max_ = np.argmax(np.array(draw))
        return max_

    def observe(self, k_t, y_t):
        # Bayesian update: a success increments alpha, a failure increments beta
        self.failure[k_t] += 1 - y_t
        self.success[k_t] += y_t
        # Visualization stuff
        for i_k in range(self.K):
            self.failures[i_k].append(self.failure[i_k])
            self.successes[i_k].append(self.success[i_k])
mab = MAB_Problem(delta=0.05)
player = TS(2)
rewards = []
played_arm = []
for i in mab.start_game():
    k = player.select()
    y = mab.play(k)
    player.observe(k, y)
    rewards.append(y)
    played_arm.append(k)
(Figures: evolution of the Beta posteriors of each arm, and the Thompson Sampling run.)
The mean $\mu_{k,t}$ changes during the run.
Classic algorithms provide no guarantees against non-stationarity.
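A standard remedy (not covered in this notebook, added here as an illustrative sketch with a hypothetical helper) is to estimate the means over a sliding window, so that old observations are gradually forgotten:

from collections import deque

class SlidingWindowMean():
    """Empirical mean over the last `window` observations only (hypothetical helper)."""
    def __init__(self, window=1000):
        self.buffer = deque(maxlen=window)  # Older rewards fall off automatically

    def update(self, y_t):
        self.buffer.append(y_t)

    def mean(self):
        return sum(self.buffer) / len(self.buffer) if self.buffer else 0.0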
Rewards depend on context.
Several families of problems: